# Code snippet for loading data from a relational database into Snowflake


import logging
import os
from datetime import datetime

import pandas as pd
from sqlalchemy import create_engine


# Connect to the source database and extract yesterday's transactions
# In a production environment, you would configure this connection properly
# and potentially include more sophisticated filtering logic
# Extract yesterday's transactions from the source database.
# In a production environment this connection would be configured properly
# and the filter would likely be more sophisticated.
# NOTE(review): assumes a plain 'date' column on the transactions table —
# your schema may use transaction_timestamp or another date/time field.
YESTERDAY_TRANSACTIONS_QUERY = "SELECT * FROM transactions WHERE date = CURRENT_DATE - 1"
transactions_df = pd.read_sql(YESTERDAY_TRANSACTIONS_QUERY, source_database_connection)


# Next, we'll set up our connection to Snowflake using SQLAlchemy. This
# provides a clean, standardized way to interact with Snowflake:


# Create Snowflake connection using SQLAlchemy engine 
# For production environments, use environment variables or a secure credential store
# Build the Snowflake SQLAlchemy connection URL from its parts.
# For production environments, use environment variables or a secure
# credential store instead of hard-coded placeholder values.
_sf_user = 'your_username'
_sf_password = 'your_password'
_sf_account = 'your_account'
_sf_database = 'your_database'
_sf_schema = 'your_schema'
_sf_warehouse = 'your_warehouse'
_sf_role = 'your_role'

engine = create_engine(
    f'snowflake://{_sf_user}:{_sf_password}@{_sf_account}'
    f'/{_sf_database}/{_sf_schema}'
    f'?warehouse={_sf_warehouse}&role={_sf_role}'
)


# Finally, we'll load the data into Snowflake with comprehensive error handling:
# This will first attempt to execute the code in the try block to load the
# data into Snowflake, and will be directed to the exception block in case
# of an error.


# Write to Snowflake (includes error handling for retries).
try:
    # to_sql creates or appends to a table named 'daily_transactions'; the
    # target table's schema will match the pandas DataFrame structure.
    # Chunked writes keep memory bounded and enable better error recovery.
    transactions_df.to_sql(
        'daily_transactions',
        engine,
        if_exists='append',  # Adds new records without overwriting existing data
        index=False,         # Don't include DataFrame index as a column
        chunksize=10000,     # Process in smaller batches for better memory management
    )
except Exception as e:
    # Log the error with details
    error_message = f"Failed to load transactions: {str(e)}"
    logging.error(error_message)

    # Identify rows that aren't completely null (have at least one non-null
    # value) — this helps isolate potentially problematic records.
    # (Vectorized form; equivalent to the row-wise apply/lambda but runs in C.)
    failed_records = transactions_df[~transactions_df.isnull().all(axis=1)]

    # Save failed records for review and later processing.
    # Bug fix: the original wrote this line at top-level indentation (an
    # IndentationError) and assumed the 'failed_loads' directory existed.
    os.makedirs('failed_loads', exist_ok=True)
    failed_records.to_csv(
        f'failed_loads/transactions_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
    )

    # Notify the team (send_alert is assumed to be defined elsewhere).
    send_alert(error_message)

    # If the error is transient (like a network timeout or connection issue),
    # we can attempt to reload the data. The retry_load function would
    # implement exponential backoff to avoid overwhelming the server.
    if is_retryable_error(e):
        retry_load(transactions_df, max_retries=3)